1 STRATAA microbiome analysis

1.1 Imports

library(rbiom)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggforce)
library(patchwork)
library(vegan)
library(dplyr)
library(forcats)

1.2 Sources

The file handles are set in config.R as they’re used by both this script and data_cleaning.

source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/core_functions.R")
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/config.R")

1.3 Metadata basics

First do the plots with kruskall wallis comparison between groups to see if there’s an overall difference, then do the pairwise tests because KW says there is a difference.

metadata <- read_metadata(metadata_handle)

metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())

number_per_country %>% kbl() %>% kable_styling()
Country count
Bangladesh 120
Malawi 114
Nepal 76
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
Group count
Acute typhoid 103
Carrier 110
Healthy Control 97
baseline_chars <- get_baseline_characteristics(metadata)

baseline_chars$pct_anti
## [1] 37.50000  0.00000  0.00000 26.47059  0.00000  0.00000 44.82759  0.00000
## [9]  0.00000
baseline_chars$pct_anti %>% na_if(0)
## [1] 37.50000       NA       NA 26.47059       NA       NA 44.82759       NA
## [9]       NA
# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()

# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') 

baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
Country variable_name Acute typhoid Carrier Healthy Control
Bangladesh Median age 6.0 37.0 28.5
Bangladesh Women (%) 47.5 47.5 65.0
Bangladesh Antibiotics in last 2 weeks (%) 37.5 NA NA
Malawi Median age 9.7 32.0 24.0
Malawi Women (%) 64.7 57.5 65.0
Malawi Antibiotics in last 2 weeks (%) 26.5 NA NA
Nepal Median age 17.2 43.9 35.0
Nepal Women (%) 24.1 66.7 82.4
Nepal Antibiotics in last 2 weeks (%) 44.8 NA NA
ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")

comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)

ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)

country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n()) 

plot_sex <- function(eg1, c){
  d <- eg1 %>% filter(Country == c)
  p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) + 
    geom_bar(stat ='identity', position = 'fill') + 
    ylab('Proportion') + 
    ggtitle(c)# +
    #theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
  
  return(p)
}

m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex 

1.4 read in metaphlan data

strataa_metaphlan_data <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.all_strataa_metaphlan.csv'), header= TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE, check.names=FALSE)
strataa_metaphlan_data$lowest_taxonomic_level <- sapply(str_split(row.names(strataa_metaphlan_data), "\\|"), function(x) x[length(x)])

strataa_metaphlan_metadata <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.strataa_metadata.metaphlan.csv'), header = TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE)
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(SampleID = rownames(strataa_metaphlan_metadata))
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(age_bracket=cut(Age, breaks=c(0, 1, 5, 15, Inf), labels=c("<1", "1-5", "6-15", ">15")))

1.5 Alpha diversity - metaphlan

Alpha diversity - all countries, all groups

all_countries_all_groups_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Control_HealthySerosurvey', 'Carrier')))

all_countries_all_groups_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country:Group 4 7.83 1.96 12.4 3.09e-09 significant
Group 2 5.92 2.96 18.7 2.55e-08 significant
Country 2 2.75 1.38 8.7 0.00022 significant
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.83 0.913 5.77 0.00352 significant
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.25 0.625 3.95 0.0204 not_significant
Sex:Group 2 1.13 0.566 3.58 0.0292 not_significant
Country:Sex 2 1.11 0.553 3.49 0.0318 not_significant
Country:Sex:Age 2 1.03 0.517 3.27 0.0396 not_significant
Country:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.835 0.417 2.64 0.0733 not_significant
Country:Age 2 0.576 0.288 1.82 0.164 not_significant
Country:Group:Age 4 1.03 0.256 1.62 0.169 not_significant
Sex:Age 1 0.298 0.298 1.88 0.171 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.474 0.237 1.5 0.226 not_significant
Country:Sex:Group 4 0.817 0.204 1.29 0.274 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.318 0.159 1.01 0.367 not_significant
Group:Age 2 0.239 0.119 0.754 0.471 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0529 0.0529 0.334 0.564 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0476 0.0476 0.301 0.584 not_significant
Sex 1 0.0294 0.0294 0.186 0.667 not_significant
Sex:Group:Age 2 0.051 0.0255 0.161 0.851 not_significant
Country:Sex:Group:Age 4 0.127 0.0317 0.201 0.938 not_significant
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.000136 0.000136 0.000861 0.977 not_significant
Age 1 1.07e-08 1.07e-08 6.75e-08 1 not_significant
Residuals 261 41.3 0.158 NA NA NA
all_countries_all_groups_alpha$alpha_plot_group

all_countries_all_groups_alpha$alpha_plot_antibiotics

Alpha diversity - all countries, healthy and acute

all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

all_countries_healthy_acute_alpha$alpha_by_country

all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country 2 5.74 2.87 17.4 1.36e-07 significant
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.68 0.842 5.12 0.00699 significant
Sex:Age 1 1.07 1.07 6.5 0.0117 not_significant
Country:Sex 2 1.5 0.75 4.56 0.0119 not_significant
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.39 0.693 4.21 0.0165 not_significant
Country:Age 2 1.02 0.51 3.1 0.0477 not_significant
Sex:Group 1 0.628 0.628 3.82 0.0525 not_significant
Country:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.816 0.408 2.48 0.0871 not_significant
Country:Group 2 0.698 0.349 2.12 0.123 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.618 0.309 1.88 0.157 not_significant
Group 1 0.164 0.164 0.997 0.319 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.359 0.179 1.09 0.339 not_significant
Sex 1 0.0592 0.0592 0.36 0.55 not_significant
Country:Sex:Age 2 0.188 0.0938 0.57 0.567 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0511 0.0511 0.31 0.578 not_significant
Sex:Group:Age 1 0.0368 0.0368 0.224 0.637 not_significant
Group:Age 1 0.0147 0.0147 0.0891 0.766 not_significant
Country:Group:Age 2 0.0713 0.0357 0.217 0.805 not_significant
Age 1 0.00772 0.00772 0.0469 0.829 not_significant
Country:Sex:Group 2 0.0595 0.0297 0.181 0.835 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.00675 0.00675 0.041 0.84 not_significant
Country:Sex:Group:Age 2 0.05 0.025 0.152 0.859 not_significant
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.000136 0.000136 0.000828 0.977 not_significant
Residuals 163 26.8 0.165 NA NA NA
all_countries_healthy_acute_alpha$alpha_plot_group

all_countries_healthy_acute_alpha$alpha_plot_antibiotics

alpha diversity, malawi only

malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 2.27 2.27 16 0.000167 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 1 1.31 1.31 9.27 0.0034 significant
Sex 1 1.16 1.16 8.21 0.00567 significant
Group 1 0.84 0.84 5.93 0.0177 not_significant
Age 1 0.451 0.451 3.19 0.079 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.406 0.406 2.87 0.0952 not_significant
Sex:Group 1 0.313 0.313 2.21 0.142 not_significant
Sex:Group:Age 1 0.0978 0.0978 0.691 0.409 not_significant
Sex:Age 1 0.0164 0.0164 0.116 0.734 not_significant
Group:Age 1 0.00122 0.00122 0.00864 0.926 not_significant
Residuals 63 8.91 0.141 NA NA NA
malawi_healthy_acute_alpha$alpha_plot_group

malawi_healthy_acute_alpha$alpha_plot_antibiotics

1.6 Beta diversity - metaphlan

All groups.

all_countries_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0759082 0.0009990 significant
Group:Age 0.0121511 0.0029970 significant
Sex:Group:Age 0.0081783 0.0909091 not_significant
Sex:Group 0.0079033 0.1238761 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0076947 0.1348651 not_significant
Age 0.0038070 0.1668332 not_significant
Sex 0.0035891 0.2147852 not_significant
Sex:Age 0.0035425 0.2227772 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0035541 0.2247752 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0027831 0.4545455 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0045854 0.7862138 not_significant
Total 1.0000000 NA NA
Residual 0.8663032 NA NA
bgd_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
bgd_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.2679584 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0124851 0.0449550 not_significant
Sex:Age 0.0093987 0.1278721 not_significant
Age 0.0084979 0.1688312 not_significant
Sex:Group 0.0155821 0.1788212 not_significant
Sex 0.0078067 0.2357642 not_significant
Group:Age 0.0115137 0.5144855 not_significant
Sex:Group:Age 0.0104905 0.6063936 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0047215 0.6433566 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0044103 0.7022977 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0035306 0.8121878 not_significant
Total 1.0000000 NA NA
Residual 0.6436043 NA NA
# bgd_beta_all_groups$pc12
# bgd_beta_all_groups$pc34

mal_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
mal_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1942771 0.0009990 significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0225358 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0247209 0.0019980 significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0169857 0.0089910 significant
Group:Age 0.0223640 0.0209790 not_significant
Sex:Age 0.0138711 0.0229770 not_significant
Sex:Group 0.0217097 0.0269730 not_significant
Age 0.0114622 0.0489510 not_significant
Sex 0.0075054 0.2667333 not_significant
Sex:Group:Age 0.0129653 0.4355644 not_significant
Total 1.0000000 NA NA
Residual 0.6516027 NA NA
# mal_beta_all_groups$pc12
# mal_beta_all_groups$pc34

nep_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
nep_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0898195 0.0009990 significant
Sex 0.0176933 0.1408591 not_significant
Sex:Age 0.0148693 0.2667333 not_significant
Age 0.0125478 0.4395604 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0124906 0.4835165 not_significant
Sex:Group 0.0239383 0.5904096 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0106433 0.6063936 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0228450 0.6373626 not_significant
Sex:Group:Age 0.0169339 0.9370629 not_significant
Group:Age 0.0162360 0.9750250 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0102948 1.0000000 not_significant
Total 1.0000000 NA NA
Residual 0.7516884 NA NA
all_countries_beta_all_groups$pc12 / bgd_beta_all_groups$pc12 / nep_beta_all_groups$pc12

Acute vs healthy.

all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0285252 0.0009990 significant
Group:Age 0.0117687 0.0109890 not_significant
Age 0.0092014 0.0419580 not_significant
Sex 0.0070879 0.1118881 not_significant
Sex:Group:Age 0.0072077 0.1288711 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0117438 0.2187812 not_significant
Sex:Age 0.0058394 0.2197802 not_significant
Sex:Group 0.0057923 0.2507493 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0055130 0.2817183 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0043096 0.4985015 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0070480 0.8281718 not_significant
Total 1.0000000 NA NA
Residual 0.8959630 NA NA
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0782718 0.0009990 significant
Sex 0.0230592 0.0339660 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0199564 0.0689311 not_significant
Sex:Age 0.0168558 0.1618382 not_significant
Age 0.0146689 0.2337662 not_significant
Group:Age 0.0109377 0.4715285 not_significant
Sex:Group:Age 0.0100500 0.5704296 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0080933 0.7372627 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0074906 0.8221778 not_significant
Sex:Group 0.0067483 0.9100899 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0059228 0.9200799 not_significant
Total 1.0000000 NA NA
Residual 0.7979452 NA NA
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1995900 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0381931 0.0009990 significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0371210 0.0019980 significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0267524 0.0069930 significant
Sex:Group 0.0200496 0.0239760 not_significant
Age 0.0194908 0.0429570 not_significant
Sex:Age 0.0144797 0.1248751 not_significant
Sex 0.0141068 0.1508492 not_significant
Sex:Group:Age 0.0091754 0.4685315 not_significant
Group:Age 0.0078947 0.5994006 not_significant
Total 1.0000000 NA NA
Residual 0.6131466 NA NA
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0598990 0.0039960 significant
Sex 0.0484279 0.0139860 not_significant
Sex:Group 0.0246755 0.2857143 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0450942 0.3516484 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0244159 0.3516484 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0207951 0.4975025 not_significant
Sex:Group:Age 0.0175717 0.6633367 not_significant
Age 0.0154038 0.7792208 not_significant
Sex:Age 0.0148604 0.8621379 not_significant
Group:Age 0.0102689 0.9850150 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0188359 0.9990010 not_significant
Total 1.0000000 NA NA
Residual 0.6997518 NA NA
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12

1.7 Plot phyla bar plots

# get the taxa that are phyla, not classes, or below (species etc), and tidy the data.
phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")

# relative_abundance > 1 returns a list of TRUE/FALSE values, which is then summed to get the number of samples in which the phylum is present at > 1% relative abundance.
# then we filter to only keep phyla that are present at 1% in at least 10% of samples.
phyla_to_exclude <- phyla %>% group_by(clade_name) %>% 
    summarise(count = sum(relative_abundance > 1)) %>% 
    filter(count < (length(unique(phyla$sample)) / 10)) %>% 
    pull(clade_name)
View(phyla_to_exclude)

# in order to make each sample add up to 100, we need to add the excluded taxa back in as a single "rare taxa" phylum.
# first we need to calculate the relative abundance of the excluded taxa in each sample.
excluded_phyla <- phyla %>%
  filter(clade_name %in% phyla_to_exclude) %>% group_by(sample) %>% summarise(relative_abundance = sum(relative_abundance))
# then make a column with the name "rare_taxa" for each sample, annd bind it to the excluded taxa data.
rare_taxa_column <- data.frame(lowest_taxonomic_level = c(rep("rare_taxa", nrow(excluded_phyla))), clade_name = c(rep("rare_taxa", nrow(excluded_phyla))))
excluded_phyla <- cbind(rare_taxa_column, excluded_phyla)

# then remove the excluded taxa from the phyla data.
phyla_clean <- phyla %>%
  filter(!(clade_name %in% phyla_to_exclude))
# and add the excluded taxa back in.
phyla_clean <- rbind(phyla_clean, excluded_phyla)
View(phyla_clean)
View(excluded_phyla)

colnames(strataa_metaphlan_metadata)
## [1] "Group"                                               
## [2] "Sex"                                                 
## [3] "Country"                                             
## [4] "Age"                                                 
## [5] "Antibiotics_taken_before_sampling_yes_no_assumptions"
## [6] "sequencing_lane"                                     
## [7] "SampleID"                                            
## [8] "age_bracket"
metadata_select <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)


phyla_clean_metadata <- phyla_clean %>% left_join(metadata_select, by = c("sample" = "SampleID"))
phyla_clean_metadata <- phyla_clean_metadata %>% mutate(Group = ifelse(Group == "Acute_Typhi", "Typhi", Group)) %>% mutate(Group = ifelse(Group == "Control_HealthySerosurvey", "Healthy", Group))

order_of_groups <- c("Typhi", "Healthy")

plot_per_country_abundance <- function(phyla_clean_metadata, country, group_order){
    # thanks chatgpt!
    phyla_clean_country <- phyla_clean_metadata %>% filter(Country == country) %>% filter(Group %in% group_order)

    phyla_clean_country_fct <- phyla_clean_country %>%
    mutate(Group = factor(Group, levels = group_order),  # Convert group to a factor with the desired order
            group_order_numeric = as.numeric(Group),  # Create a new numeric variable based on the order of group
            sample = fct_reorder(sample, group_order_numeric))  # Reorder sample based on group_order_numeric

    p <- ggplot(data = phyla_clean_country_fct, aes(x = sample, y = relative_abundance, fill = clade_name)) + 
        geom_bar(stat = "identity") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank()) + 
        scale_x_discrete(breaks=phyla_clean_country_fct$sample, labels=phyla_clean_country_fct$Group) + 
        ggtitle(country) +
        ylim(0, 100.01)
    p
}

bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)

bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot

# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_select, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from =  'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
clade_name Bangladesh_Acute_Typhi Bangladesh_Control_HealthySerosurvey Malawi_Acute_Typhi Malawi_Control_HealthySerosurvey Nepal_Acute_Typhi Nepal_Control_HealthySerosurvey
k__Archaea|p__Candidatus_Thermoplasmatota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Archaea|p__Euryarchaeota 0.000000 0.000000 0.249010 0.000000 0.00000 0.22474
k__Archaea|p__Thaumarchaeota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Acidobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Actinobacteria 14.361185 13.136440 1.893725 0.524965 3.33634 3.81550
k__Bacteria|p__Bacteria_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Bacteroidetes 0.911970 2.216730 15.361165 34.630185 28.24679 20.53695
k__Bacteria|p__Candidatus_Melainabacteria 0.000000 0.000000 0.038085 0.052210 0.00000 0.02157
k__Bacteria|p__Candidatus_Saccharibacteria 0.017310 0.015625 0.000000 0.000000 0.00000 0.00203
k__Bacteria|p__Chloroflexi 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Elusimicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Firmicutes 80.670075 72.161565 60.161350 58.311770 53.74090 60.17250
k__Bacteria|p__Fusobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Lentisphaerae 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Proteobacteria 0.629715 1.272295 6.606440 6.195335 2.90174 2.52509
k__Bacteria|p__Spirochaetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Synergistetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Tenericutes 0.000000 0.000000 0.002715 0.000000 0.01965 0.01362
k__Bacteria|p__Verrucomicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Ascomycota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Eukaryota_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000

1.8 Taxa associated with participant groups

1.8.1 basics, stats and plots

Maaslin basics

groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0

There were 296 species significantly (q-val < 0.05) associated with health/disease in Malawi, in Bangladesh, and in Nepal.

Combine the taxonomic maaslins, and print out the species that are sig in both and share directions.

Because sequencing run and participant type are totally confounded for Bangladesh, need to exclude sequencing run from the final model for Bangladesh (otherwise, wipes out the signals).

between acute <-> health.

### associated at both sites

combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179 s__Clostridium_SGB6179 8.79 1.34 4.25e-05 3.11 0.936 0.0286
k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A s__Prevotella_copri_clade_A 4.54 0.978 0.00971 7.64 1.25 1.21e-05
k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809 s__GGB4266_SGB5809 4.58 1.09 0.0147 6.94 0.981 4.38e-07
k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae s__Haemophilus_parainfluenzae 3.64 0.979 0.03 6.92 0.953 2.26e-07
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis s__Romboutsia_timonensis 4.52 1.25 0.0386 5.07 0.903 5.91e-05
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium s__Lachnospiraceae_bacterium -3.05 0.838 0.0366 -2.87 0.785 0.0135
nrow(combined_results$mwi_maaslin_only)
## [1] 290
nrow(combined_results$bang_maaslin_only)
## [1] 17

There were 290 species significantly (q-val < 0.05) associated with health/disease in Malawi only and 17 in Bangladesh only.

### associated at only one site

The ones associated at only one site are written out to a file, you can look at them manually there.

1.8.2 plots of species of interest

strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))


pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')

cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')

SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')

hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')

rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')

lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')

# pc + cs + SGB5809 + hp + rt + lb

1.9 P. copri clades

Print the results for all the P. copri clades

all_taxa_maaslin <- combined_maaslins$all_features
p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))

1.10 maaslin functional groups

Functional groups associated with health

bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')

bang_func_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
combined_results <- run_combine_maaslins(bang_func_maaslin, mal_func_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)

combined_results$positive_coef %>%  kbl() %>% kable_styling()
MGC_class Species feature metadata value coef_bang stderr_bang N_bang N.not.0_bang pval_bang qval_bang coef_mal stderr_mal N_mal N.not.0_mal pval_mal qval_mal
succinate2propionate Dialister_invisus_DSM_15470_genomic_scaffold gb.GG698602.1.region002.GC_DNA..Entryname.succinate2propionate..OS.Dialister_invisus_DSM_15470_genomic_scaffold..SMASHregion.region002..NR.1 Group Control_HealthySerosurvey 4.487491 0.7685363 80 57 0.0000001 0.0002378 4.319261 0.9371688 55 44 0.0000334 0.0019648
Pyruvate2acetate.formate Prevotella_copri_strain_AM22.19 gb.QRIF01000001.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.7 Group Control_HealthySerosurvey 4.947861 0.8647077 80 73 0.0000002 0.0002593 4.763907 1.2291770 55 50 0.0003426 0.0122273
TPP_AA_metabolism Prevotella_sp._S7_MS_2 gb.JRPT01000001.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Prevotella_sp._S7_MS_2..SMASHregion.region001..NR.11..BG.2 Group Control_HealthySerosurvey 4.861273 0.8899833 80 75 0.0000006 0.0003811 5.135414 1.1610044 55 52 0.0000610 0.0031362
Arginine2_Hcarbonate .Clostridium._sordellii_genome_assembly gb.LN679998.1.region007.GC_DNA..Entryname.Arginine2_Hcarbonate..OS..Clostridium._sordellii_genome_assembly..SMASHregion.region007..NR.2 Group Control_HealthySerosurvey 4.586442 0.8235724 80 62 0.0000004 0.0003811 3.511901 0.7321910 55 42 0.0000181 0.0011569
Others_HGD_unassigned Prevotella_copri_strain_AF24.12 gb.QRVA01000039.1.region001.GC_DNA..Entryname.Others_HGD_unassigned..OS.Prevotella_copri_strain_AF24.12..SMASHregion.region001..NR.6 Group Control_HealthySerosurvey 4.723351 0.8622062 80 74 0.0000005 0.0003811 5.742395 1.0935772 55 51 0.0000040 0.0003585
Pyruvate2acetate.formate Prevotella_sp._AM34.19LB gb.QSIG01000002.1.region002.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_sp._AM34.19LB..SMASHregion.region002..NR.3 Group Control_HealthySerosurvey 4.247639 0.7826518 80 73 0.0000007 0.0003811 3.278032 0.9556571 55 48 0.0013025 0.0335603
TPP_fatty_acids Prevotella_copri_strain_AM22.19 gb.QRIF01000003.1.region001.GC_DNA..Entryname.TPP_fatty_acids..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.6 Group Control_HealthySerosurvey 5.074953 0.9522039 80 72 0.0000010 0.0004225 5.334982 1.0684820 55 52 0.0000094 0.0007055
Rnf_complex Prevotella_copri_strain_AF38.11 gb.QROP01000022.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Prevotella_copri_strain_AF38.11..SMASHregion.region001..NR.6 Group Control_HealthySerosurvey 4.736879 0.9139504 80 72 0.0000018 0.0006485 5.228857 1.1248300 55 51 0.0000293 0.0017618
PFOR_II_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 4.841984 0.9834626 80 63 0.0000049 0.0010417 2.979921 0.7187059 55 44 0.0001475 0.0062890
Others_HGD_unassigned Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region003.GC_DNA..Entryname.Others_HGD_unassigned..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region003..NR.1 Group Control_HealthySerosurvey 4.591409 0.9348343 80 67 0.0000052 0.0010472 2.761655 0.5584659 55 39 0.0000110 0.0008116
porA Prevotella_copri_strain_AF10.17 gb.QSAV01000013.1.region001.GC_DNA..Entryname.porA..OS.Prevotella_copri_strain_AF10.17..SMASHregion.region001..NR.6 Group Control_HealthySerosurvey 4.459775 0.9181644 80 73 0.0000063 0.0012403 5.078009 1.1688664 55 51 0.0000786 0.0038673
Rnf_complex Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region002.GC_DNA..Entryname.Rnf_complex..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region002..NR.1 Group Control_HealthySerosurvey 4.363720 0.9569944 80 60 0.0000196 0.0031040 2.830776 0.6216771 55 40 0.0000400 0.0022830
PFOR_II_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region004.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region004..NR.1 Group Control_HealthySerosurvey 4.389654 0.9675717 80 69 0.0000213 0.0031830 3.365707 0.8138609 55 45 0.0001526 0.0064576
Formate_dehydrogenase Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV837963.1.region001.GC_DNA..Entryname.Formate_dehydrogenase..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 Group Control_HealthySerosurvey 3.408756 0.7538940 80 47 0.0000225 0.0032720 4.891279 0.9853786 55 45 0.0000104 0.0007674
Pyruvate2acetate.formate.Acetyl.CoA_pathway Romboutsia_ilealis_strain_CRIB_genome gb.LN555523.1.region005.GC_DNA..Entryname.Pyruvate2acetate.formate.Acetyl.CoA_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region005..NR.1 Group Control_HealthySerosurvey 4.255279 0.9998841 80 74 0.0000595 0.0072007 4.577294 1.1095387 55 51 0.0001575 0.0065929
Pyruvate2acetate.formate Clostridium_carboxidivorans gb.CP011803.1.region013.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_carboxidivorans..SMASHregion.region013..NR.1 Group Control_HealthySerosurvey 3.692280 0.8904588 80 62 0.0000880 0.0095422 3.111543 0.7615447 55 41 0.0001784 0.0073319
OD_eut_pdu_related.PFOR_II_pathway Paeniclostridium_sordellii_strain_AM370 gb.CP014150.1.region011.GC_DNA..Entryname.OD_eut_pdu_related.PFOR_II_pathway..OS.Paeniclostridium_sordellii_strain_AM370..SMASHregion.region011..NR.4 Group Control_HealthySerosurvey 2.611181 0.6285831 80 48 0.0000856 0.0095422 3.944312 0.5736462 55 34 0.0000000 0.0000046
TPP_AA_metabolism.Arginine2putrescine Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV837955.1.region001.GC_DNA..Entryname.TPP_AA_metabolism.Arginine2putrescine..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 Group Control_HealthySerosurvey 3.928049 0.9505741 80 52 0.0000925 0.0095901 4.391415 0.9179545 55 49 0.0000188 0.0011989
Respiratory_glycerol Haemophilus_parainfluenzae_HK2019 gb.AJTC01000042.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Haemophilus_parainfluenzae_HK2019..SMASHregion.region001..NR.5 Group Control_HealthySerosurvey 3.546442 0.8817680 80 55 0.0001363 0.0135794 5.139593 0.9752294 55 47 0.0000037 0.0003443
Pyruvate2acetate.formate Haemophilus_parainfluenzae_HK262 gb.AJMW01000041.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.5 Group Control_HealthySerosurvey 3.168962 0.7913820 80 49 0.0001450 0.0136983 5.084917 1.0124480 55 47 0.0000085 0.0006570
TPP_AA_metabolism Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291615.1.region002.GC_DNA..Entryname.TPP_AA_metabolism..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region002..NR.1 Group Control_HealthySerosurvey 3.243306 0.8125781 80 49 0.0001517 0.0137569 2.499383 0.7198030 55 35 0.0011514 0.0303032
Arginine2putrescine.Putrescine2spermidine Romboutsia_sp._Frifi_strain_FRIFI_genome gb.LN650648.1.region002.GC_DNA..Entryname.Arginine2putrescine.Putrescine2spermidine..OS.Romboutsia_sp._Frifi_strain_FRIFI_genome..SMASHregion.region002..NR.1 Group Control_HealthySerosurvey 2.663903 0.6834368 80 49 0.0002095 0.0177351 2.571396 0.5227425 55 37 0.0000120 0.0008682
Pyruvate2acetate.formate Haemophilus_haemolyticus_HK386 gb.AJSV01000030.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_haemolyticus_HK386..SMASHregion.region001..NR.2 Group Control_HealthySerosurvey 2.378368 0.6314161 80 25 0.0003269 0.0233914 5.021323 0.7241403 55 37 0.0000000 0.0000040
Pyruvate2acetate.formate Haemophilus_aegyptius_ATCC_11116_genomic_scaffold gb.GL878527.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_aegyptius_ATCC_11116_genomic_scaffold..SMASHregion.region001..NR.2 Group Control_HealthySerosurvey 2.158465 0.5739901 80 29 0.0003339 0.0235574 3.518440 0.6789089 55 37 0.0000050 0.0004277
acetate2butyrate Clostridium_botulinum_B_str._Eklund gb.CP001056.1.region002.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region002..NR.5..BG.2 Group Control_HealthySerosurvey 2.416257 0.6459628 80 33 0.0003569 0.0245034 2.529664 0.7542482 55 33 0.0016248 0.0395537
OD_fatty_acids Dialister_succinatiphilus_YIT_11850_genomic_scaffold gb.JH591188.1.region001.GC_DNA..Entryname.OD_fatty_acids..OS.Dialister_succinatiphilus_YIT_11850_genomic_scaffold..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 2.937359 0.7931904 80 40 0.0004043 0.0264919 2.542256 0.5789555 55 36 0.0000676 0.0034168
Pyruvate2acetate.formate Clostridium_butyricum_strain_JKY6D1_chromosome gb.CP013352.1.region004.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_butyricum_strain_JKY6D1_chromosome..SMASHregion.region004..NR.6 Group Control_HealthySerosurvey 3.461246 0.9377451 80 59 0.0004211 0.0265098 3.345147 0.9358449 55 39 0.0008517 0.0240541
TPP_AA_metabolism Haemophilus_sputorum_HK_2154 gb.ALJP01000004.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_sputorum_HK_2154..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 2.955534 0.8121614 80 40 0.0005000 0.0298840 4.518892 0.8662388 55 39 0.0000045 0.0003922
Fumarate2succinate Haemophilus_sp._HMSC61B11_genomic_scaffold gb.KV838003.1.region001.GC_DNA..Entryname.Fumarate2succinate..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5 Group Control_HealthySerosurvey 3.338659 0.9243056 80 44 0.0005465 0.0311936 4.623872 1.0122075 55 46 0.0000382 0.0021980
Respiratory_glycerol Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold gb.KE952617.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 2.906426 0.8078251 80 41 0.0005726 0.0323199 4.637692 0.9258834 55 44 0.0000089 0.0006780
TPP_AA_metabolism Haemophilus_pittmaniae_HK_85 gb.AFUV01000010.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_pittmaniae_HK_85..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 2.425728 0.6864843 80 40 0.0007059 0.0352706 5.215879 0.8956435 55 44 0.0000006 0.0000817
PFOR_II_pathway Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291660.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 3.513096 0.9944875 80 64 0.0007082 0.0352706 3.767830 0.9815067 55 41 0.0003837 0.0132101
acetate2butyrate Clostridium_celatum_DSM_1785_genomic_scaffold gb.KB291615.1.region001.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1 Group Control_HealthySerosurvey 3.410399 0.9814122 80 64 0.0008526 0.0397344 3.413463 0.9314921 55 43 0.0006509 0.0197440
Rnf_complex Haemophilus_parainfluenzae_HK262 gb.AJMW01000064.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.6..BG.2 Group Control_HealthySerosurvey 2.804854 0.8137384 80 43 0.0009328 0.0417717 5.002957 0.9924891 55 44 0.0000080 0.0006358
OD_fatty_acids.PFOR_II_pathway Clostridium_botulinum_B_str._Eklund gb.CP001056.1.region005.GC_DNA..Entryname.OD_fatty_acids.PFOR_II_pathway..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region005..NR.5..BG.2 Group Control_HealthySerosurvey 2.572884 0.7716623 80 41 0.0013315 0.0497369 3.161364 0.8345813 55 31 0.0004481 0.0148068
combined_results$negative_coef %>%  kbl() %>% kable_styling()
MGC_class Species feature metadata value coef_bang stderr_bang N_bang N.not.0_bang pval_bang qval_bang coef_mal stderr_mal N_mal N.not.0_mal pval_mal qval_mal
NADH_dehydrogenase_I Actinomyces_viscosus_C505_genomic_scaffold gb.KI391968.1.region002.GC_DNA..Entryname.NADH_dehydrogenase_I..OS.Actinomyces_viscosus_C505_genomic_scaffold..SMASHregion.region002..NR.5 Group Control_HealthySerosurvey -3.007475 0.5744596 80 74 0.0000015 0.0005676 -2.902767 0.7955322 55 48 0.0006822 0.0205316
Nitrate_reductase Actinomyces_johnsonii_F0510_genomic_scaffold gb.KE951633.1.region001.GC_DNA..Entryname.Nitrate_reductase..OS.Actinomyces_johnsonii_F0510_genomic_scaffold..SMASHregion.region001..NR.2 Group Control_HealthySerosurvey -2.757659 0.5439238 80 67 0.0000028 0.0008322 -2.257511 0.6572948 55 23 0.0012858 0.0332056
Rnf_complex.TPP_AA_metabolism Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic gb.DS499548.1.region001.GC_DNA..Entryname.Rnf_complex.TPP_AA_metabolism..OS.Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic..SMASHregion.region001..NR.2 Group Control_HealthySerosurvey -2.443337 0.7272441 80 20 0.0012292 0.0469508 -4.287272 1.2309921 55 43 0.0011166 0.0297739
combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>%  kbl() %>% kable_styling()
MGC_class n Species
Pyruvate2acetate.formate 7 Clostridium butyricum, Clostridium carboxidivorans, Haemophilus aegyptius, Haemophilus haemolyticus, Haemophilus parainfluenzae, Prevotella copri, Prevotella sp.
TPP_AA_metabolism 4 Clostridium celatum, Haemophilus pittmaniae, Haemophilus sputorum, Prevotella sp.
PFOR_II_pathway 3 Clostridium celatum, Romboutsia ilealis, Romboutsia ilealis
Rnf_complex 3 Haemophilus parainfluenzae, Prevotella copri, Romboutsia ilealis
Others_HGD_unassigned 2 Prevotella copri, Romboutsia ilealis
Respiratory_glycerol 2 Aggregatibacter sp., Haemophilus parainfluenzae
acetate2butyrate 2 Clostridium botulinum, Clostridium celatum
Arginine2_Hcarbonate 1 .Clostridium. sordellii
Arginine2putrescine.Putrescine2spermidine 1 Romboutsia sp.
Formate_dehydrogenase 1 Haemophilus sp.
Fumarate2succinate 1 Haemophilus sp.
OD_eut_pdu_related.PFOR_II_pathway 1 Paeniclostridium sordellii
OD_fatty_acids 1 Dialister succinatiphilus
OD_fatty_acids.PFOR_II_pathway 1 Clostridium botulinum
Pyruvate2acetate.formate.Acetyl.CoA_pathway 1 Romboutsia ilealis
TPP_AA_metabolism.Arginine2putrescine 1 Haemophilus sp.
TPP_fatty_acids 1 Prevotella copri
porA 1 Prevotella copri
succinate2propionate 1 Dialister invisus
write_csv(combined_results$positive_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_positive_coef.csv'))
write_csv(combined_results$negative_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_negative_coef.csv'))
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
mwi_variables_for_analysis <- c("Group", "Sex", "Age")
bang_variables_for_analysis <- c("Group", "Sex", "Age")
run_combine_maaslins(groups_to_analyse, mwi_variables_for_analysis, bang_variables_for_analysis, maaslin_functional_output_root_folder, 'bigmap')